library(mosaic)
library(tidyverse)
library(pander)
library(DT)
library(ggrepel)
library(plotly)
library(dplyr)
library(ggplot2)
library(maps)
library(tmap)
library(leaflet)
library(htmltools)
library(car)
library(mosaicData)
library(ResourceSelection)
library(reshape2)
library(RColorBrewer)
library(scatterplot3d)
library(readr)
library(prettydoc)
library(knitr)
library(kableExtra)
library(formattable)
library(haven)
library(reshape2)
library(GGally)

Background

In this study, we will be exploring every aspect of residential homes in Ames, Iowa in order to predict the final price of each home.

Below, I start by mutating and determining which variables to utilize. Click the tab below to see that exploration.

Hide Data Exploration

Show Data Exploration

First, we’ll take a look at the pairs plot.


Alright, I’ll say it first, this thing is HUGE!! So, let’s first consider some main criteria when it comes to picking a home and choose those specific variables to look at in the pairs plot.

When choosing variables, I had 3 main criteria I wanted to hit:

  1. Location
  2. Utilities/Space
  3. Appearance

With that, I chose these variables to look at, since they fit my criteria.

# Read the Ames training data; text columns are loaded as factors so they can
# be used directly as categorical predictors later on.
train <- read.csv(file = "train.csv", stringsAsFactors = TRUE)

# Pairs plot restricted to the hand-picked location / space / appearance
# variables instead of every column in the data set.
pairs(
  train[, c(
    "SalePrice", "GrLivArea", "OverallQual", "TotalBsmtSF", "GarageCars",
    "GarageArea", "X1stFlrSF", "X2ndFlrSF", "Neighborhood", "YearBuilt",
    "YearRemodAdd"
  )]
)


In order to fit the data with those criteria in mind, I mutated the data to fit more columns into our model. I created these new columns:

  • TotalSF : The total surface area of the house, including all floors (first, second, and basement) and the garage
  • LocationScore : captures the location quality based on two factors
    • Neighborhood (Most popular to least popular neighborhood)
    • Condition (Near a positive or negative feature from the house)
  • UtilityScore : based on the home’s usability
    • different features contribute different scores based on their importance / how essential they are
  • TimeRemodel: The Year it was Sold - The Year it was Remodeled = Shows how many years have passed since it was last remodeled up to the year it sold
    • lower values = recently remodeled
    • high values = older/outdated remodel
  • OverallScore: the average rating of the overall condition and overall quality/finish of the home


Overall, here are the variables I chose to use and what they can tell us in terms of this study.

# knitr provides kable() for rendering the summary table.
library(knitr)

# One row per model term: the term, what it measures, and why it is useful.
# The rows are written out row-by-row and reshaped into a three-column data
# frame.
table_data <- as.data.frame(
  matrix(
    c(
      "SalePrice",
      "The final price at which the house was sold.",
      "Target variable we are trying to predict.",

      "TotalSF",
      "Total square footage of the house, including basement and garage.",
      "Bigger houses generally sell for more, making this a key predictor.",

      "LocationScore",
      "A score that evaluates the desirability of the neighborhood and location conditions.",
      "Homes in desirable locations tend to have higher sale prices.",

      "UtilityScore",
      "A score representing the house’s overall utility, considering space, features, and livability.",
      "Higher utility scores indicate more livable homes, increasing value.",

      "TimeRemodel",
      "Number of years since the last remodeling or addition was completed.",
      "More recently remodeled homes tend to sell for higher prices.",

      "Neighborhood",
      "The specific neighborhood in which the house is located.",
      "Neighborhood greatly influences home values due to amenities and demand.",

      "OverallScore",
      "An average of Overall Quality and Overall Condition ratings.",
      "Houses with better quality and condition typically sell for more.",

      "Neighborhood:TotalSF",
      "An interaction term that accounts for how the effect of total square footage varies across neighborhoods.",
      "Captures how the impact of house size varies depending on the neighborhood."
    ),
    ncol = 3,
    byrow = TRUE
  )
)
names(table_data) <- c("Variable", "Description", "How_it_Helps_Us")

# Render as a markdown table with reader-friendly column headings.
kable(
  table_data,
  format = "markdown",
  col.names = c("Variable", "What it Looks at", "How it Helps Us")
)
Variable What it Looks at How it Helps Us
SalePrice The final price at which the house was sold. Target variable we are trying to predict.
TotalSF Total square footage of the house, including basement and garage. Bigger houses generally sell for more, making this a key predictor.
LocationScore A score that evaluates the desirability of the neighborhood and location conditions. Homes in desirable locations tend to have higher sale prices.
UtilityScore A score representing the house’s overall utility, considering space, features, and livability. Higher utility scores indicate more livable homes, increasing value.
TimeRemodel Number of years since the last remodeling or addition was completed. More recently remodeled homes tend to sell for higher prices.
Neighborhood The specific neighborhood in which the house is located. Neighborhood greatly influences home values due to amenities and demand.
OverallScore An average of Overall Quality and Overall Condition ratings. Houses with better quality and condition typically sell for more.
Neighborhood:TotalSF An interaction term that accounts for how the effect of total square footage varies across neighborhoods. Captures how the impact of house size varies depending on the neighborhood.
# Feature engineering for the training data. Each mutate() step appends new
# columns to `train`, and later steps use columns created by earlier ones, so
# the step order matters.
#
# Categorical ratings are turned into numbers through named lookup vectors:
# `c(level = score, ...)[as.character(column)]` yields NA for any level that is
# not listed (or is missing), and `coalesce(..., 0)` turns that NA into the
# default score of 0 where a default is wanted.
train <- train %>%
  # Size measures. TotalRoom counts bedrooms, kitchens and bathrooms, with
  # every half bath counted as half a room.
  mutate(
    TotalSF = X1stFlrSF + X2ndFlrSF + TotalBsmtSF + GarageArea,
    TotalRoom = FullBath + (HalfBath * 0.5) + BsmtFullBath +
      (BsmtHalfBath * 0.5) + KitchenAbvGr + BedroomAbvGr
  ) %>%
  # Per-feature usability scores; unlisted or missing levels score 0.
  mutate(
    Utilities_score = coalesce(
      unname(
        c(AllPub = 4, NoSewr = 3, NoSeWa = 2, ELO = 1)[as.character(Utilities)]
      ),
      0
    ),
    Street_score = coalesce(
      unname(c(Pave = 1, Grvl = 0)[as.character(Street)]),
      0
    ),
    # A missing Alley value (no alley access recorded) falls through to 0.
    Alley_score = coalesce(
      unname(c(Pave = 2, Grvl = 1)[as.character(Alley)]),
      0
    ),
    LandSlope_score = coalesce(
      unname(c(Gtl = 2, Mod = 1, Sev = 0)[as.character(LandSlope)]),
      0
    ),
    # 1 when central air is present, 0 otherwise (NA stays NA).
    CentralAir_score = as.numeric(CentralAir == "Y"),
    PavedDrive_score = coalesce(
      unname(c(Y = 2, P = 1, N = 0)[as.character(PavedDrive)]),
      0
    ),
    # Ratings divided by 10.
    OverallQual_norm = OverallQual / 10,
    OverallCond_norm = OverallCond / 10,
    HeatingQC_score = coalesce(
      unname(
        c(Ex = 5, Gd = 4, TA = 3, Fa = 2, Po = 1)[as.character(HeatingQC)]
      ),
      0
    ),
    KitchenQual_score = coalesce(
      unname(
        c(Ex = 5, Gd = 4, TA = 3, Fa = 2, Po = 1)[as.character(KitchenQual)]
      ),
      0
    ),
    Functional_score = coalesce(
      unname(
        c(
          Typ = 5, Min1 = 4, Min2 = 3, Mod = 2, Maj1 = 1, Maj2 = 0
        )[as.character(Functional)]
      ),
      0
    )
  ) %>%
  # Weighted sum of the usability scores and the raw area columns. The areas
  # enter unscaled, exactly as in the fitted model.
  mutate(
    UtilityScore = (0.15 * Utilities_score) +
      (0.10 * GrLivArea) +
      (0.07 * TotalBsmtSF) +
      (0.06 * GarageArea) +
      (0.05 * KitchenQual_score) +
      (0.05 * HeatingQC_score) +
      (0.05 * Functional_score) +
      (0.04 * PavedDrive_score) +
      (0.03 * Alley_score) +
      (0.02 * Street_score) +
      (0.02 * LandSlope_score) +
      (0.02 * CentralAir_score) +
      (0.05 * WoodDeckSF) +
      (0.05 * OpenPorchSF)
  ) %>%
  # Exterior appearance scores. These have no default: a level that is not
  # listed (including the "None" placeholder) produces NA.
  mutate(
    # Missing house styles become an explicit "None" level.
    HouseStyle = as.factor(replace_na(as.character(HouseStyle), "None")),
    # Scored on popularity.
    HouseStyle_Score = unname(
      c(
        "2.5Fin" = 8, "2Story" = 7, "1Story" = 6, "SLvl" = 5,
        "2.5Unf" = 4, "1.5Fin" = 3, "SFoyer" = 2, "1.5Unf" = 1
      )[as.character(HouseStyle)]
    ),
    # Missing lot shapes become an explicit "None" level.
    LotShape = as.factor(replace_na(as.character(LotShape), "None")),
    LotShape_Score = unname(
      c(Reg = 4, IR1 = 3, IR2 = 2, IR3 = 1)[as.character(LotShape)]
    ),
    # Round-trip through character to rebuild the factor.
    ExterQual = as.factor(as.character(ExterQual)),
    ExterQual_Score = unname(
      c(Ex = 5, Gd = 4, TA = 3, Fa = 2, Po = 1)[as.character(ExterQual)]
    ),
    ExterCond = as.factor(as.character(ExterCond)),
    ExterCond_Score = unname(
      c(Ex = 5, Gd = 4, TA = 3, Fa = 2, Po = 1)[as.character(ExterCond)]
    )
  ) %>%
  mutate(
    # Mean of the overall quality and overall condition ratings.
    OverallScore = (OverallQual + OverallCond) / 2,
    # Neighborhood tier (2-5, with 3 for unlisted neighborhoods) plus an
    # adjustment for nearby features: +2 if either condition column is a
    # positive feature, otherwise -1 if either is one of the listed
    # road/railroad codes, otherwise 0.
    LocationScore = case_when(
      Neighborhood %in% c("NoRidge", "NridgHt", "StoneBr", "Veenker") ~ 5,
      Neighborhood %in% c("NWAmes", "Somerst", "Timber", "ClearCr") ~ 4,
      Neighborhood %in% c("Sawyer", "SawyerW", "Edwards", "BrkSide") ~ 2,
      TRUE ~ 3
    ) + case_when(
      Condition1 %in% c("PosN", "PosA") |
        Condition2 %in% c("PosN", "PosA") ~ 2,
      Condition1 %in% c("Artery", "Feedr", "RRAn", "RRNe") |
        Condition2 %in% c("Artery", "Feedr", "RRAn", "RRNe") ~ -1,
      TRUE ~ 0
    ),
    # 1 for the ten listed neighborhoods, 0 for all others.
    PopularNbrHd = as.numeric(
      Neighborhood %in% c(
        "NAmes", "CollgCr", "OldTown", "Edwards", "Somerst",
        "Gilbert", "NridgHt", "Sawyer", "NWAmes", "SawyerW"
      )
    ),
    # Years between the last remodel and the sale.
    TimeRemodel = YrSold - YearRemodAdd,
    # Sum of the four exterior appearance scores.
    OutdoorScore = HouseStyle_Score + LotShape_Score +
      ExterQual_Score + ExterCond_Score
  )

# Working model: numeric scores plus Neighborhood, with a separate TotalSF
# slope for each neighborhood through the interaction term.
house.lm <- lm(
  SalePrice ~ TotalSF + LocationScore + UtilityScore + TimeRemodel +
    Neighborhood + Neighborhood:TotalSF + OverallScore,
  data = train
)

# Pairs plot of the response and the model's predictors, with a smoothed
# trend drawn in every panel.
pairs(
  train[, c(
    "SalePrice", "TotalSF", "TimeRemodel", "UtilityScore",
    "Neighborhood", "LocationScore", "OverallScore"
  )],
  panel = panel.smooth
)


Visuals

The visuals below will look at how the variables affect and interact with each other when it comes to predicting SalePrice.

Some of them at a glance will be difficult to read, thus a subset of each graph will be given to look at each factor individually. Click through the tabs to see each visual.

Total Surface Area

Neighborhood

Key Findings:

  • Steeper slopes show a stronger impact of TotalSF on SalePrice
  • More expensive neighborhoods tend to have higher SalePrices at any given TotalSF (ex. StoneBr)
# SalePrice vs. TotalSF, coloured by neighborhood, with the fitted values of
# `house.lm` overlaid as lines grouped by LocationScore.
# The fitted values are taken with fitted() instead of `house.lm$fit`, which
# only worked through partial matching of `$` on the `fitted.values` element.
# `size` replaces the base-graphics alias `cex`, matching the other plots.
TSA.N <- ggplot(train, aes(y = SalePrice, x = TotalSF, color = factor(Neighborhood))) +
  geom_point() +
  geom_line(
    aes(y = fitted(house.lm), group = interaction(LocationScore)),
    size = 0.5
  ) +
  theme_minimal()

# Interactive version of the plot above.
ggplotly(TSA.N)

# One panel per neighborhood, each with its own least-squares line.
ggplot(train, aes(y = SalePrice, x = TotalSF, color = factor(Neighborhood))) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  theme_minimal() +
  facet_wrap(~Neighborhood)



Location Score

Key Findings:

  • The different LocationScore categories all seem to fall within the same range of square footage (they all look a bit more clustered within a specific range, despite some outliers)
  • The main change is the slope of each category
    • ex. LocationScore 5 has a steeper slope, thus the price increases more rapidly as square footage increases in comparison to LocationScore 4
# SalePrice vs. TotalSF with a separate least-squares line per LocationScore.
TSA.LS <- ggplot(
  data = train,
  mapping = aes(x = TotalSF, y = SalePrice, color = factor(LocationScore))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  theme_minimal()

# Interactive version of the combined plot.
ggplotly(TSA.LS)

# Static version with one panel per LocationScore: same layers, facets added.
TSA.LS + facet_wrap(~LocationScore)



Overall Score

Key Findings:

  • For OverallScore values between 3 and 7, as the quality of a home improves, the SalePrice per unit of TotalSF increases
    • However, at an OverallScore of 7.5, there are instances where a house with a large square footage sold very low and a house with a smaller square footage sold very high, so this graph’s interpretation is a bit confusing
# SalePrice vs. TotalSF with a separate least-squares line per OverallScore.
TSA.OS <- ggplot(
  data = train,
  mapping = aes(x = TotalSF, y = SalePrice, color = factor(OverallScore))
) +
  geom_point(size = 1) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  theme_minimal()

# Interactive version of the combined plot.
ggplotly(TSA.OS)

# Static version with one panel per OverallScore: same layers, facets added.
TSA.OS + facet_wrap(~OverallScore)



Time Since Remodeling

Neighborhood

Key Findings:

  • Some of the Neighborhood categories (Blmngtn, NridgHt, Somerst, etc.) show that their homes were renovated a lot more recently, based on how small their TimeRemodel values are
    • Additionally, their steep slopes show that the more recently a house was renovated, the higher the SalePrice
# SalePrice vs. years since remodel, one least-squares line per neighborhood.
TSR.N <- ggplot(
  data = train,
  mapping = aes(x = TimeRemodel, y = SalePrice, color = factor(Neighborhood))
) +
  geom_point(size = 1) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  theme_minimal()

# Interactive version of the combined plot.
ggplotly(TSR.N)

# Static version with one panel per neighborhood: same layers, facets added.
TSR.N + facet_wrap(~Neighborhood)



Location Score

Key Findings:

  • For a LocationScore of 5, the more recently a home was renovated, the higher the SalePrice in that desirable location
    • if it hasn’t been renovated very recently, the value of the SalePrice decreases based on that location
  • However, the majority of LocationScore scores show that regardless of how late or how recent the home was renovated, the SalePrice of the home stays fairly consistent or is a slight decrease, despite the LocationScore
# SalePrice vs. years since remodel, one least-squares line per LocationScore.
TSR.LS <- ggplot(
  data = train,
  mapping = aes(x = TimeRemodel, y = SalePrice, color = factor(LocationScore))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  theme_minimal()

# Interactive version of the combined plot.
ggplotly(TSR.LS)

# Static version with one panel per LocationScore: same layers, facets added.
TSR.LS + facet_wrap(~LocationScore)



Overall Score

Key Findings:

Here’s a clearer way to express those observations:

  • The SalePrice patterns mirror what we observed in the LocationScore graphs for homes with OverallScore ratings between 4-6
    • The timing of renovations (whether recent or delayed) appears to have minimal impact on SalePrice, regardless of the home’s OverallScore
  • Interestingly, buyers show a preference for homes with lower OverallScore ratings, even when comparing recently and previously renovated properties
    • This suggests that a home’s fundamental quality and condition may compensate for less frequent maintenance
# SalePrice vs. years since remodel, one least-squares line per OverallScore.
TSR.OS <- ggplot(
  data = train,
  mapping = aes(x = TimeRemodel, y = SalePrice, color = factor(OverallScore))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  theme_minimal()

# Interactive version of the combined plot.
ggplotly(TSR.OS)

# Static version with one panel per OverallScore: same layers, facets added.
TSR.OS + facet_wrap(~OverallScore)



Utility Score

Neighborhood

Key Findings:

  • More expensive neighborhoods (StoneBr, NridgHt, etc.) show a steeper slope, meaning that a higher UtilityScore results in a higher SalePrice (expensive places have more things)
    • On the other end, the cheaper neighborhoods have a smaller UtilityScore and thus a smaller SalePrice (cheaper places don’t have that much stuff)
# SalePrice vs. UtilityScore with one least-squares line per neighborhood.
U.N <- ggplot(
  data = train,
  mapping = aes(x = UtilityScore, y = SalePrice, color = factor(Neighborhood))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  theme_minimal()

# Interactive version of the combined plot.
ggplotly(U.N)

# Static version with one panel per neighborhood: same layers, facets added.
U.N + facet_wrap(~Neighborhood)



Location Score

Key Findings:

  • UtilityScore seems to be prioritized over LocationScore
    • Regardless of whether the location is more or less than ideal, the SalePrice increases as UtilityScore increases
# SalePrice vs. UtilityScore with one least-squares line per LocationScore.
U.LS <- ggplot(
  data = train,
  mapping = aes(x = UtilityScore, y = SalePrice, color = factor(LocationScore))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  theme_minimal()

# Interactive version of the combined plot.
ggplotly(U.LS)

# Static version with one panel per LocationScore: same layers, facets added.
U.LS + facet_wrap(~LocationScore)



Overall Score

Key Findings:

  • As the OverallScore goes from smallest to largest, we can see that the increased UtilityScore increases along with the SalePrice
    • Thus, SalePrice as UtilityScore increases with every increase of OverallScore
# SalePrice vs. UtilityScore with one least-squares line per OverallScore.
U.OS <- ggplot(
  data = train,
  mapping = aes(x = UtilityScore, y = SalePrice, color = factor(OverallScore))
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, size = 2) +
  theme_minimal()

# Interactive version of the combined plot.
ggplotly(U.OS)

# Static version with one panel per OverallScore: same layers, facets added.
U.OS + facet_wrap(~OverallScore)



Regression Model

Now, we will put the model to use through testing as well as interpretation.

Mathematical Equation

This is the mathematical model on which my regression model is based:

\[ \underbrace{Y_i}_{SalePrice} = \beta_0 + \beta_1\underbrace{X_{i1}}_{TotalSF}+ \beta_2\underbrace{X_{i2}}_{LocationScore} + \beta_3\underbrace{X_{i3}}_{UtilityScore} + \beta_4\underbrace{X_{i4}}_{TimeRemodel} + \beta_5\underbrace{X_{i5}}_{Neighborhood} + \beta_6\underbrace{X_{i6}}_{OverallScore} + \beta_7\underbrace{X_{i5}X_{i1}}_{Neighborhood:TotalSF} + \epsilon_i, \quad \text{where } \epsilon_i \sim N(0, \sigma^2) \]

  • there being different levels of Neighborhood, LocationScore, and OverallScore within the model.

Proceed to the next tab to see our results



Linear Regression

After looking at this regression, the most significant variables are:

  • LocationScore(0.001629) : As the location quality increases, the predicted sale price of a home would increase by $5480 per location score.

  • UtilityScore(7.724e-16) : As quality and quantity of utilities increase, the predicted sale price of a home would increase by $525.90 per utility score.

  • TimeRemodel (1.46e-05) : As the number of years since remodeling/renovating a home increases, the predicted sale price would decrease by $213.10 per year.

  • OverallScore(1.221e-48) : As the overall quality and condition of a home increases, the sale price of a home would increase by $16951 per overall score.

  • NeighborhoodStoneBr(0.04949) : When selling the houses in the Stone Brooke Neighborhood, they sell $164471 less compared to average neighborhood. Therefore, other neighborhoods might be more desirable or houses in this neighborhood could be more open to bargaining.

  • TotalSF:NeighborhoodEdwards (0.02362) : For each additional square foot of surface area of a house in the Edwards neighborhood, the sale price decreases by $55.22. (This is due to a very big house [13170 \(ft^2\), costing $160,000] selling for less than smaller houses that sold for more money)

  • TotalSF:NeighborhoodStoneBr (0.03662) : For each additional surface area of house in the Stone Brooke neighborhood, the sale price increases by $52.54 per surface area. While the neighborhood’s houses itself can become fairly cheap, that still doesn’t take away from the fact that when house size is involved that sale prices can increase.

The individual results will vary by as much as 53636.


The insignificance of some of these variables could be due to:

  • the favoring of one feature over another (ex. buyers care more about location over total surface area)
  • the similar trends/impact of multiple categories (ex. NeighborhoodClearCr and NeighborhoodIDOTRR display similar trends and don’t differentiate as much as NeighborhoodStoneBr)
# Original (train data)
houseO.lm <- lm(SalePrice ~ TotalSF + as.factor(LocationScore) + UtilityScore + TimeRemodel + as.factor(Neighborhood) + as.factor(OverallScore) + Neighborhood:TotalSF , data=train)
summary(houseO.lm) %>% pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) -92979 82601 -1.126 0.2605
TotalSF 23.1 24.51 0.9424 0.3462
as.factor(LocationScore)2 13100 4186 3.129 0.001789
as.factor(LocationScore)3 19107 5264 3.63 0.0002939
as.factor(LocationScore)4 17613 7427 2.372 0.01785
as.factor(LocationScore)5 24780 8571 2.891 0.003898
as.factor(LocationScore)6 24724 11797 2.096 0.03629
as.factor(LocationScore)7 -25296 28562 -0.8857 0.376
UtilityScore 465.6 64.51 7.217 8.721e-13
TimeRemodel -208.4 48.7 -4.278 2.013e-05
as.factor(Neighborhood)Blueste 16111 212177 0.07593 0.9395
as.factor(Neighborhood)BrDale 37853 90115 0.42 0.6745
as.factor(Neighborhood)BrkSide 61898 79202 0.7815 0.4346
as.factor(Neighborhood)ClearCr 50776 84841 0.5985 0.5496
as.factor(Neighborhood)CollgCr 30332 78597 0.3859 0.6996
as.factor(Neighborhood)Crawfor 48508 79432 0.6107 0.5415
as.factor(Neighborhood)Edwards 138769 78279 1.773 0.07649
as.factor(Neighborhood)Gilbert 37231 80184 0.4643 0.6425
as.factor(Neighborhood)IDOTRR 18342 80963 0.2266 0.8208
as.factor(Neighborhood)MeadowV 80167 80315 0.9982 0.3184
as.factor(Neighborhood)Mitchel 64982 79508 0.8173 0.4139
as.factor(Neighborhood)NAmes 85739 78315 1.095 0.2738
as.factor(Neighborhood)NoRidge -141931 80776 -1.757 0.07912
as.factor(Neighborhood)NPkVill 96151 184902 0.52 0.6031
as.factor(Neighborhood)NridgHt -103230 80188 -1.287 0.1982
as.factor(Neighborhood)NWAmes 80122 80435 0.9961 0.3194
as.factor(Neighborhood)OldTown 61370 78514 0.7817 0.4346
as.factor(Neighborhood)Sawyer 89864 79401 1.132 0.2579
as.factor(Neighborhood)SawyerW 41832 79331 0.5273 0.5981
as.factor(Neighborhood)Somerst 16677 79690 0.2093 0.8343
as.factor(Neighborhood)StoneBr -156328 82479 -1.895 0.05825
as.factor(Neighborhood)SWISU 73287 80895 0.9059 0.3651
as.factor(Neighborhood)Timber 1446 81612 0.01772 0.9859
as.factor(Neighborhood)Veenker -133682 95439 -1.401 0.1615
as.factor(OverallScore)2 52988 39148 1.354 0.1761
as.factor(OverallScore)2.5 31556 30040 1.05 0.2937
as.factor(OverallScore)3 9263 29503 0.314 0.7536
as.factor(OverallScore)3.5 39540 27525 1.437 0.1511
as.factor(OverallScore)4 34736 27009 1.286 0.1986
as.factor(OverallScore)4.5 44353 26723 1.66 0.09719
as.factor(OverallScore)5 52622 26649 1.975 0.04851
as.factor(OverallScore)5.5 58594 26633 2.2 0.02797
as.factor(OverallScore)6 64095 26655 2.405 0.01632
as.factor(OverallScore)6.5 73519 26708 2.753 0.005988
as.factor(OverallScore)7 90897 26805 3.391 0.0007159
as.factor(OverallScore)7.5 96859 27250 3.554 0.0003916
as.factor(OverallScore)8 116979 27890 4.194 2.91e-05
as.factor(OverallScore)8.5 119578 31069 3.849 0.0001241
as.factor(OverallScore)9.5 245998 33922 7.252 6.806e-13
TotalSF:NeighborhoodBlueste -9.242 82.49 -0.112 0.9108
TotalSF:NeighborhoodBrDale -20.7 32.61 -0.6349 0.5256
TotalSF:NeighborhoodBrkSide -22.76 24.5 -0.9288 0.3531
TotalSF:NeighborhoodClearCr -12.99 25.71 -0.5051 0.6135
TotalSF:NeighborhoodCollgCr -6.758 24.06 -0.2809 0.7789
TotalSF:NeighborhoodCrawfor -12.4 24.3 -0.5104 0.6098
TotalSF:NeighborhoodEdwards -54.87 23.95 -2.291 0.02212
TotalSF:NeighborhoodGilbert -7.09 24.67 -0.2874 0.7738
TotalSF:NeighborhoodIDOTRR -13.37 25.76 -0.5189 0.6039
TotalSF:NeighborhoodMeadowV -40.73 25.58 -1.592 0.1116
TotalSF:NeighborhoodMitchel -23.7 24.44 -0.9698 0.3323
TotalSF:NeighborhoodNAmes -33.03 24.01 -1.376 0.169
TotalSF:NeighborhoodNoRidge 36.19 24.23 1.493 0.1356
TotalSF:NeighborhoodNPkVill -37.68 68.07 -0.5535 0.58
TotalSF:NeighborhoodNridgHt 33.52 24.22 1.384 0.1666
TotalSF:NeighborhoodNWAmes -28.27 24.53 -1.152 0.2493
TotalSF:NeighborhoodOldTown -33.77 24.11 -1.401 0.1614
TotalSF:NeighborhoodSawyer -32.9 24.51 -1.343 0.1796
TotalSF:NeighborhoodSawyerW -9.833 24.27 -0.4051 0.6855
TotalSF:NeighborhoodSomerst 1.446 24.27 0.05956 0.9525
TotalSF:NeighborhoodStoneBr 51.08 24.69 2.069 0.03873
TotalSF:NeighborhoodSWISU -35.39 24.99 -1.416 0.157
TotalSF:NeighborhoodTimber 4.267 24.66 0.173 0.8627
TotalSF:NeighborhoodVeenker 45.63 28.78 1.586 0.113
Fitting linear model: SalePrice ~ TotalSF + as.factor(LocationScore) + UtilityScore + TimeRemodel + as.factor(Neighborhood) + as.factor(OverallScore) + Neighborhood:TotalSF
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
1460 26353 0.8954 0.89



Model Validation

Now, we will validate our model. The verification of this model will help us know that the model fit on this one sample of data will continue to fit well on a new sample of data. This will be verified through the Validation Adjusted \(R^2\). This is calculated with the code below and presented with the other \(R^2\) values for comparison:

# Hold-out validation of the working model: fit on a random subset of `train`,
# predict the held-out rows, and compare in-sample R^2 with validation R^2.

# Fixed seed so the train/test split (and the reported numbers) is repeatable.
set.seed(12242003)

# Use 1000 rows for fitting, or every row if the data has fewer (1460 total).
num_rows <- min(1000, nrow(train))
# seq_len() is used instead of 1:nrow(train) so an empty data frame cannot
# produce the sequence c(1, 0); the sampled indices are otherwise identical.
keep <- sample(seq_len(nrow(train)), num_rows)

mytrain <- train[keep, ]  # rows used to fit: lm(..., data = mytrain)
mytest <- train[-keep, ]  # held-out rows: predict(..., newdata = mytest)

# Refit the working model on the fitting subset only.
# Note: this overwrites the `house.lm` that was fitted on all of `train`.
house.lm <- lm(
  SalePrice ~ TotalSF + LocationScore + UtilityScore + TimeRemodel +
    Neighborhood + Neighborhood:TotalSF + OverallScore,
  data = mytrain
)

# Predictions for the held-out rows.
yh_myhouse <- predict(house.lm, newdata = mytest)

# Any prediction that comes back NA is replaced with the mean sale price of
# the fitting subset so that the sums of squares below stay defined.
if (anyNA(yh_myhouse)) {
  yh_myhouse[is.na(yh_myhouse)] <- mean(mytrain$SalePrice, na.rm = TRUE)
}

# Total and error sums of squares on the held-out rows.
ybar <- mean(mytest$SalePrice)
SSTO <- sum((mytest$SalePrice - ybar)^2)
SSE_myhouse <- sum((mytest$SalePrice - yh_myhouse)^2)

# Validation R^2.
rs_hd <- 1 - SSE_myhouse / SSTO

# Validation adjusted R^2: penalise by the number of model coefficients
# relative to the number of held-out observations.
n <- nrow(mytest)
p_myhouse <- length(coef(house.lm))
rsa_myhouse <- 1 - (n - 1) / (n - p_myhouse) * SSE_myhouse / SSTO

# Side-by-side comparison of in-sample and validation fit. summary() is
# computed once and the final column headings are set directly
# (check.names = FALSE keeps the "$R^2$" headings as written).
house_fit <- summary(house.lm)
house.table <- data.frame(
  "Original $R^2$" = house_fit$r.squared,
  "Original Adj. $R^2$" = house_fit$adj.r.squared,
  "Validation $R^2$" = rs_hd,
  "Validation Adj. $R^2$" = rsa_myhouse,
  check.names = FALSE
)

knitr::kable(house.table, escape = TRUE, digits = 4)
Original \(R^2\) Original Adj. \(R^2\) Validation \(R^2\) Validation Adj. \(R^2\)
0.8929 0.887 0.8739 0.8574

As we can see, the drop from the Original Adjusted \(R^2\) to the Validation Adjusted \(R^2\) goes from 0.8870 to 0.8574. Thus, with a difference of just 0.0296, we can see that the model captures the essence of the data fairly well and shows no signs of over fitting.


# Exploratory scratch code for an alternative model (`househd`).
# NOTE(review): this section was rendered as plain text, which turned the
# quotes into smart quotes, dropped the `*` operators and fused statements
# onto one line. It has been restored to valid R, and the model is now fitted
# before it is used.

# NOTE(review): `train2` is not defined anywhere in the visible file. Confirm
# which data frame was intended before running this line.
plot(SalePrice ~ ., data = train2)

# Fit the alternative model first so `househd` exists for the code below.
househd <- lm(
  SalePrice ~ TotalSF + LotArea + GarageArea + Alley + FullBath + ScreenPorch,
  data = train
)
summary(househd)

# Coefficient vector, printed for reference.
b <- coef(househd)
b

# Builds the text "b[1]*(Intercept)+b[2]*TotalSF+..." as a template for the
# prediction function used in the plot below.
paste0("b[", seq_along(b), "]*", names(b), collapse = "+")

# SalePrice vs. GarageArea by Alley x FullBath combination, with the model's
# prediction curve at fixed values of the other predictors.
# NOTE(review): the coefficient positions b[5] (AlleyNone) and b[6]
# (AlleyPave) assume Alley has a "None" level. Verify against names(b) before
# trusting the curve.
ggplot(
  househd,
  aes(x = GarageArea, y = SalePrice, color = interaction(Alley, FullBath))
) +
  geom_point() +
  stat_function(
    fun = function(GarageArea, TotalSF = 1200, LotArea = 10000, AlleyNone = 1,
                   AlleyPave = 0, FullBath = 2, ScreenPorch = 0) {
      b[1] + b[2] * TotalSF + b[3] * LotArea + b[4] * GarageArea +
        b[5] * AlleyNone + b[6] * AlleyPave + b[7] * FullBath +
        b[8] * ScreenPorch
    }
  ) +
  facet_wrap(~interaction(Alley, FullBath))